-- Create a SAS credential (include the leading '?') to access the Blob container with patents data
CREATE DATABASE SCOPED CREDENTIAL blob_sas
WITH IDENTITY = 'SHARED ACCESS SIGNATURE',
     SECRET   = '?sv=2024-...&ss=bfqt&srt=co&sp=rl...&sig=...';
GO

--  Create the PolyBase data source pointing at the container
CREATE EXTERNAL DATA SOURCE MyBlob
WITH (
    TYPE = HADOOP,
    LOCATION   = 'wasbs://[container name]@[blob name].blob.core.windows.net',
    CREDENTIAL = blob_sas
);
GO

-- Parquet, in case you CTAS to lake
IF NOT EXISTS (SELECT 1 FROM sys.external_file_formats WHERE name = 'ParquetFmt')
    CREATE EXTERNAL FILE FORMAT ParquetFmt
    WITH (FORMAT_TYPE = PARQUET);
GO


/* =======================================================================
   0) One-time: quoted TSV external file format (if you don't have it yet)
   ======================================================================= */
IF NOT EXISTS (SELECT 1 FROM sys.external_file_formats WHERE name = 'CsvFmtQ')
BEGIN
    CREATE EXTERNAL FILE FORMAT CsvFmtQ
    WITH (
        FORMAT_TYPE = DELIMITEDTEXT,
        FORMAT_OPTIONS (
            FIELD_TERMINATOR = '\t',
            STRING_DELIMITER = '"',
            ENCODING = 'UTF8',
            USE_TYPE_DEFAULT = FALSE
        )
    );
END
GO


/* =========================================================
   1) External tables (raw)  — all columns as text where risky
   ========================================================= */

-- 1.1 application.tsv
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'Application_xt' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.Application_xt;');
GO
CREATE EXTERNAL TABLE dbo.Application_xt
(
    id               NVARCHAR(4000),
    patent_id        NVARCHAR(4000),
    series_code      NVARCHAR(4000),
    number           NVARCHAR(4000),
    country          NVARCHAR(4000),
    applicationdate  NVARCHAR(4000)
)
WITH (
    LOCATION     = '/application.tsv',
    DATA_SOURCE  = MyBlob,
    FILE_FORMAT  = CsvFmtQ,
    REJECT_TYPE  = VALUE,
    REJECT_VALUE = 1000000
);
GO

-- 1.2 patent.tsv
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'Patents_xt' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.Patents_xt;');
GO
CREATE EXTERNAL TABLE dbo.Patents_xt
(
    id            NVARCHAR(4000),
    [type]        NVARCHAR(4000),
    [number]      NVARCHAR(4000),
    country       NVARCHAR(4000),
    grantdate     NVARCHAR(4000),
    abstracttext  NVARCHAR(4000),
    title         NVARCHAR(4000),
    kind          NVARCHAR(4000),
    num_claims    NVARCHAR(4000),
    filename      NVARCHAR(4000),
    withdrawn     NVARCHAR(4000)
)
WITH (
    LOCATION     = '/patent.tsv',
    DATA_SOURCE  = MyBlob,
    FILE_FORMAT  = CsvFmtQ,
    REJECT_TYPE  = VALUE,
    REJECT_VALUE = 1000000
);
GO

-- 1.3 cpc_current.tsv
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'Cpc_current_xt' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.Cpc_current_xt;');
GO
CREATE EXTERNAL TABLE dbo.Cpc_current_xt
(
    uuid           NVARCHAR(4000),
    patent_id      NVARCHAR(4000),
    section_id     NVARCHAR(4000),
    subsection_id  NVARCHAR(4000),
    group_id       NVARCHAR(4000),
    subgroup_id    NVARCHAR(4000),
    category       NVARCHAR(4000),
    sequence       NVARCHAR(4000)
)
WITH (
    LOCATION     = '/cpc_current.tsv',
    DATA_SOURCE  = MyBlob,
    FILE_FORMAT  = CsvFmtQ,
    REJECT_TYPE  = VALUE,
    REJECT_VALUE = 1000000
);
GO

-- 1.4 uspatentcitation.tsv
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'Patent_citations_xt' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.Patent_citations_xt;');
GO
CREATE EXTERNAL TABLE dbo.Patent_citations_xt
(
    uuid              NVARCHAR(4000),
    patent_id         NVARCHAR(4000),  -- citing
    citation_id       NVARCHAR(4000),  -- cited
    citationgrantdate NVARCHAR(4000),
    citationname      NVARCHAR(4000),
    citationkind      NVARCHAR(4000),
    citationcountry   NVARCHAR(4000),
    citationcategory  NVARCHAR(4000),
    citationsequence  NVARCHAR(4000)
)
WITH (
    LOCATION     = '/uspatentcitation.tsv',
    DATA_SOURCE  = MyBlob,
    FILE_FORMAT  = CsvFmtQ,
    REJECT_TYPE  = VALUE,
    REJECT_VALUE = 1000000
);
GO


/* =========================================================
   2) Internal, typed & distributed tables for fast joins
   ========================================================= */

-- 2.1 Application_int (typed + AppYear), HASH by patent_id
IF OBJECT_ID(N'dbo.Application_int', N'U') IS NOT NULL DROP TABLE dbo.Application_int;
GO
CREATE TABLE dbo.Application_int
WITH (
    DISTRIBUTION = HASH(patent_id),
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT
    TRY_CAST(id AS BIGINT)                       AS id,
    CAST(patent_id AS NVARCHAR(100))             AS patent_id,
    CAST(series_code AS NVARCHAR(20))            AS series_code,
    CAST([number] AS NVARCHAR(100))              AS [number],
    CAST(country AS NVARCHAR(20))                AS country,
    applicationdate,
    TRY_CONVERT(DATE, applicationdate)           AS application_date,
    YEAR(TRY_CONVERT(DATE, applicationdate))     AS AppYear
FROM dbo.Application_xt
WHERE TRY_CAST(id AS BIGINT) IS NOT NULL;
GO
CREATE STATISTICS st_app_pid  ON dbo.Application_int(patent_id);
CREATE STATISTICS st_app_year ON dbo.Application_int(AppYear);
GO

-- 2.2 Patents_int (only fields you actually use), HASH by number
IF OBJECT_ID(N'dbo.Patents_int', N'U') IS NOT NULL DROP TABLE dbo.Patents_int;
GO
CREATE TABLE dbo.Patents_int
WITH (
    DISTRIBUTION = HASH([number]),
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT
    TRY_CAST(id AS BIGINT)                AS id,
    CAST([type] AS NVARCHAR(50))          AS [type],
    CAST([number] AS NVARCHAR(100))       AS [number],
    CAST(country AS NVARCHAR(20))         AS country,
    TRY_CONVERT(DATE, grantdate)          AS grantdate,
    TRY_CAST(num_claims AS INT)           AS num_claims
FROM dbo.Patents_xt
WHERE TRY_CAST(id AS BIGINT) IS NOT NULL;
GO
CREATE STATISTICS st_pat_num ON dbo.Patents_int([number]);
GO

-- 2.3 Cpc_current_int (only needed columns), HASH by patent_id
IF OBJECT_ID(N'dbo.Cpc_current_int', N'U') IS NOT NULL DROP TABLE dbo.Cpc_current_int;
GO
CREATE TABLE dbo.Cpc_current_int
WITH (
    DISTRIBUTION = HASH(patent_id),
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT
    TRY_CAST(uuid AS BIGINT)          AS uuid,
    CAST(patent_id AS NVARCHAR(100))  AS patent_id,
    CAST(group_id  AS NVARCHAR(50))   AS group_id
FROM dbo.Cpc_current_xt
WHERE patent_id IS NOT NULL AND group_id IS NOT NULL;
GO
CREATE STATISTICS st_cpc_pid ON dbo.Cpc_current_int(patent_id);
CREATE STATISTICS st_cpc_gid ON dbo.Cpc_current_int(group_id);
GO

-- 2.4 Patent_citations internal copies for local joins on either key
-- 2.4a HASH by citing patent_id
IF OBJECT_ID(N'dbo.Patent_citations_byCiting_int', N'U') IS NOT NULL
    DROP TABLE dbo.Patent_citations_byCiting_int;
GO
CREATE TABLE dbo.Patent_citations_byCiting_int
WITH (DISTRIBUTION = HASH(patent_id), CLUSTERED COLUMNSTORE INDEX)
AS
SELECT
    CAST(patent_id   AS NVARCHAR(100)) AS patent_id,
    CAST(citation_id AS NVARCHAR(100)) AS citation_id
FROM dbo.Patent_citations_xt
WHERE patent_id IS NOT NULL AND citation_id IS NOT NULL;
GO
CREATE STATISTICS st_cit_citing ON dbo.Patent_citations_byCiting_int(patent_id);
GO

-- 2.4b HASH by cited citation_id
IF OBJECT_ID(N'dbo.Patent_citations_byCited_int', N'U') IS NOT NULL
    DROP TABLE dbo.Patent_citations_byCited_int;
GO
CREATE TABLE dbo.Patent_citations_byCited_int
WITH (DISTRIBUTION = HASH(citation_id), CLUSTERED COLUMNSTORE INDEX)
AS
SELECT
    CAST(patent_id   AS NVARCHAR(100)) AS patent_id,
    CAST(citation_id AS NVARCHAR(100)) AS citation_id
FROM dbo.Patent_citations_xt
WHERE patent_id IS NOT NULL AND citation_id IS NOT NULL;
GO
CREATE STATISTICS st_cit_cited ON dbo.Patent_citations_byCited_int(citation_id);
GO


/* =========================================================
   3) Base views (PatentsData, CitesWithYears, etc.)
   ========================================================= */

-- 3.1 PatentsData (your @PatentsData)
IF OBJECT_ID(N'dbo.PatentsData', N'V') IS NOT NULL DROP VIEW dbo.PatentsData;
GO
CREATE VIEW dbo.PatentsData AS
SELECT
    A.patent_id,
    A.applicationdate,
    A.AppYear,
    A.country,
    P.grantdate,
    P.num_claims,
    P.[type]
FROM dbo.Application_int AS A
JOIN dbo.Patents_int     AS P
  ON A.patent_id = P.[number];
GO

-- 3.2 CitesWithYears (application years for both sides)
IF OBJECT_ID(N'dbo.CitesWithYears', N'V') IS NOT NULL DROP VIEW dbo.CitesWithYears;
GO
CREATE VIEW dbo.CitesWithYears AS
SELECT
    C.citation_id  AS patent_id,     -- focal (cited)
    CA.AppYear     AS PatentAppYear, -- focal's application year
    PA.AppYear     AS CitingAppYear
FROM dbo.Patent_citations_byCited_int AS C
JOIN dbo.PatentsData AS CA  ON CA.patent_id = C.citation_id
JOIN dbo.PatentsData AS PA  ON PA.patent_id = C.patent_id;
GO

-- 3.3 Forward citation counts by window
IF OBJECT_ID(N'dbo.PatentFCites', N'V') IS NOT NULL DROP VIEW dbo.PatentFCites;
GO
CREATE VIEW dbo.PatentFCites AS
SELECT
    patent_id,
    PatentAppYear,
    SUM(CASE WHEN CitingAppYear <= PatentAppYear +  3 THEN 1 ELSE 0 END) AS FCites_3year,
    SUM(CASE WHEN CitingAppYear <= PatentAppYear +  5 THEN 1 ELSE 0 END) AS FCites_5year,
    SUM(CASE WHEN CitingAppYear <= PatentAppYear + 10 THEN 1 ELSE 0 END) AS FCites_10year
FROM dbo.CitesWithYears
GROUP BY patent_id, PatentAppYear;
GO

-- 3.4 Rank within cohort & top-5% flags
IF OBJECT_ID(N'dbo.PatentFCitesRanked', N'V') IS NOT NULL DROP VIEW dbo.PatentFCitesRanked;
GO
CREATE VIEW dbo.PatentFCitesRanked AS
SELECT *,
       COUNT(*) OVER (PARTITION BY PatentAppYear) AS CohortSize,
       RANK()  OVER (PARTITION BY PatentAppYear ORDER BY FCites_3year  DESC, patent_id) AS Rank_3yr,
       RANK()  OVER (PARTITION BY PatentAppYear ORDER BY FCites_5year  DESC, patent_id) AS Rank_5yr,
       RANK()  OVER (PARTITION BY PatentAppYear ORDER BY FCites_10year DESC, patent_id) AS Rank_10yr
FROM dbo.PatentFCites;
GO

IF OBJECT_ID(N'dbo.PatentFCitesFlagged', N'V') IS NOT NULL DROP VIEW dbo.PatentFCitesFlagged;
GO
CREATE VIEW dbo.PatentFCitesFlagged AS
SELECT
    patent_id,
    PatentAppYear,
    FCites_3year, FCites_5year, FCites_10year,
    CASE WHEN Rank_3yr  <= CEILING(0.05 * CohortSize) THEN 1 ELSE 0 END AS Top5pct_3yr,
    CASE WHEN Rank_5yr  <= CEILING(0.05 * CohortSize) THEN 1 ELSE 0 END AS Top5pct_5yr,
    CASE WHEN Rank_10yr <= CEILING(0.05 * CohortSize) THEN 1 ELSE 0 END AS Top5pct_10yr
FROM dbo.PatentFCitesRanked;
GO


/* =========================================================
   4) CPC group pair stats (current year and lags 1–4)
   ========================================================= */

-- 4.1 Application_Citation (citing patent with its AppYear)
IF OBJECT_ID(N'dbo.Application_Citation', N'V') IS NOT NULL DROP VIEW dbo.Application_Citation;
GO
CREATE VIEW dbo.Application_Citation AS
SELECT
    A.patent_id,
    A.AppYear,
    C.citation_id
FROM dbo.PatentsData AS A
JOIN dbo.Patent_citations_byCiting_int AS C
  ON A.patent_id = C.patent_id;
GO

-- 4.2 Application_Citation_CpcC (CPC of the *cited* patent)
IF OBJECT_ID(N'dbo.Application_Citation_CpcC', N'V') IS NOT NULL DROP VIEW dbo.Application_Citation_CpcC;
GO
CREATE VIEW dbo.Application_Citation_CpcC AS
SELECT DISTINCT
    A.patent_id,
    A.AppYear,
    U.group_id
FROM dbo.Cpc_current_int AS U
JOIN dbo.Application_Citation AS A
  ON U.patent_id = A.citation_id
WHERE A.AppYear IS NOT NULL;
GO

-- 4.3 Pair up CPC groups per citing patent (lexical order)
IF OBJECT_ID(N'dbo.PatentRefsCPCCGroupPairs', N'V') IS NOT NULL DROP VIEW dbo.PatentRefsCPCCGroupPairs;
GO
CREATE VIEW dbo.PatentRefsCPCCGroupPairs AS
SELECT DISTINCT
    U1.patent_id,
    U1.AppYear,
    U1.group_id AS cpcgroup1,
    U2.group_id AS cpcgroup2
FROM dbo.Application_Citation_CpcC AS U1
JOIN dbo.Application_Citation_CpcC AS U2
  ON U1.patent_id = U2.patent_id
WHERE U1.group_id < U2.group_id;
GO

-- 4.4 Yearly pair frequency & cumulative frequency
IF OBJECT_ID(N'dbo.PatentRefFieldPairsFreqPerYear', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsFreqPerYear;
GO
CREATE VIEW dbo.PatentRefFieldPairsFreqPerYear AS
SELECT
    cpcgroup1,
    cpcgroup2,
    AppYear,
    COUNT(*) AS FreqPerYear
FROM dbo.PatentRefsCPCCGroupPairs
GROUP BY cpcgroup1, cpcgroup2, AppYear;
GO

IF OBJECT_ID(N'dbo.PatentRefFieldPairsCumFreq', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsCumFreq;
GO
CREATE VIEW dbo.PatentRefFieldPairsCumFreq AS
SELECT DISTINCT
    cpcgroup1,
    cpcgroup2,
    AppYear,
    SUM(FreqPerYear) OVER (
        PARTITION BY cpcgroup1, cpcgroup2
        ORDER BY AppYear ASC
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS CumFreq,
    FIRST_VALUE(FreqPerYear) OVER (
        PARTITION BY cpcgroup1, cpcgroup2
        ORDER BY AppYear ASC
    ) AS FirstYearFreq,
    FreqPerYear
FROM dbo.PatentRefFieldPairsFreqPerYear;
GO

-- 4.5 Current-year frequencies attached to each pair instance (COALESCE→0)
IF OBJECT_ID(N'dbo.PatentRefFieldPairsFreqs', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsFreqs;
GO
CREATE VIEW dbo.PatentRefFieldPairsFreqs AS
SELECT
    P.patent_id,
    P.AppYear,
    P.cpcgroup1,
    P.cpcgroup2,
    COALESCE(S.FreqPerYear, 0)                                AS FreqPerYear2,
    COALESCE(S.CumFreq - S.FreqPerYear, 0)                    AS LaggedCumFreq
FROM dbo.PatentRefsCPCCGroupPairs      AS P
LEFT JOIN dbo.PatentRefFieldPairsCumFreq AS S
  ON  S.cpcgroup1 = P.cpcgroup1
  AND S.cpcgroup2 = P.cpcgroup2
  AND S.AppYear   = P.AppYear;
GO

-- 4.6 Percentiles and counts (current year)
IF OBJECT_ID(N'dbo.PatentRefFieldPairsStats', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsStats;
GO
CREATE VIEW dbo.PatentRefFieldPairsStats AS
SELECT DISTINCT
    patent_id,
    AppYear,
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYear2)
        OVER (PARTITION BY patent_id) AS MedianRefPairFreq,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY FreqPerYear2)
        OVER (PARTITION BY patent_id) AS Top10PRefPairFreq,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYear2)
        OVER (PARTITION BY patent_id) AS Top5PRefPairFreq,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY FreqPerYear2)
        OVER (PARTITION BY patent_id) AS Top90PRefPairFreq,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYear2)
        OVER (PARTITION BY patent_id) AS Top95PRefPairFreq,

    SUM(CASE WHEN FreqPerYear2 = 1 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelCohortPairsCount,

    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreq)
        OVER (PARTITION BY patent_id) AS MedianRefPairCumFreq,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY LaggedCumFreq)
        OVER (PARTITION BY patent_id) AS Top10PRefPairCumFreq,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreq)
        OVER (PARTITION BY patent_id) AS Top5PRefPairCumFreq,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY LaggedCumFreq)
        OVER (PARTITION BY patent_id) AS Top90PRefPairCumFreq,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreq)
        OVER (PARTITION BY patent_id) AS Top95PRefPairCumFreq,

    SUM(CASE WHEN LaggedCumFreq = 0 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelHistPairsCount
FROM dbo.PatentRefFieldPairsFreqs;
GO

/* ---------- LAG=1 .. 4 pipelines (NN→percentiles) ---------- */

-- Helper macro-ish: create (Lx view, Lx stats). Repeated for 1..4.

-- L1
IF OBJECT_ID(N'dbo.PatentRefFieldPairsFreqsL1', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsFreqsL1;
GO
CREATE VIEW dbo.PatentRefFieldPairsFreqsL1 AS
SELECT
    P.patent_id, P.AppYear, P.cpcgroup1, P.cpcgroup2,
    COALESCE(S.FreqPerYear, 0)                 AS FreqPerYearL1,
    COALESCE(S.CumFreq - S.FreqPerYear, 0)     AS LaggedCumFreqL1
FROM dbo.PatentRefsCPCCGroupPairs      AS P
LEFT JOIN dbo.PatentRefFieldPairsCumFreq AS S
  ON  S.cpcgroup1 = P.cpcgroup1
  AND S.cpcgroup2 = P.cpcgroup2
  AND S.AppYear   = P.AppYear - 1;
GO

IF OBJECT_ID(N'dbo.PatentRefFieldPairsStatsL1', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsStatsL1;
GO
CREATE VIEW dbo.PatentRefFieldPairsStatsL1 AS
SELECT DISTINCT
    patent_id,
    AppYear,
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYearL1)
        OVER (PARTITION BY patent_id) AS MedianRefPairFreqL1,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY FreqPerYearL1)
        OVER (PARTITION BY patent_id) AS Top10PRefPairFreqL1,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYearL1)
        OVER (PARTITION BY patent_id) AS Top5PRefPairFreqL1,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY FreqPerYearL1)
        OVER (PARTITION BY patent_id) AS Top90PRefPairFreqL1,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYearL1)
        OVER (PARTITION BY patent_id) AS Top95PRefPairFreqL1,

    SUM(CASE WHEN FreqPerYearL1 = 1 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelCohortPairsCountL1,

    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreqL1)
        OVER (PARTITION BY patent_id) AS MedianRefPairCumFreqL1,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY LaggedCumFreqL1)
        OVER (PARTITION BY patent_id) AS Top10PRefPairCumFreqL1,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreqL1)
        OVER (PARTITION BY patent_id) AS Top5PRefPairCumFreqL1,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY LaggedCumFreqL1)
        OVER (PARTITION BY patent_id) AS Top90PRefPairCumFreqL1,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreqL1)
        OVER (PARTITION BY patent_id) AS Top95PRefPairCumFreqL1,

    SUM(CASE WHEN LaggedCumFreqL1 = 0 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelHistPairsCountL1
FROM dbo.PatentRefFieldPairsFreqsL1;
GO

-- L2
IF OBJECT_ID(N'dbo.PatentRefFieldPairsFreqsL2', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsFreqsL2;
GO
CREATE VIEW dbo.PatentRefFieldPairsFreqsL2 AS
SELECT
    P.patent_id, P.AppYear, P.cpcgroup1, P.cpcgroup2,
    COALESCE(S.FreqPerYear, 0)                 AS FreqPerYearL2,
    COALESCE(S.CumFreq - S.FreqPerYear, 0)     AS LaggedCumFreqL2
FROM dbo.PatentRefsCPCCGroupPairs      AS P
LEFT JOIN dbo.PatentRefFieldPairsCumFreq AS S
  ON  S.cpcgroup1 = P.cpcgroup1
  AND S.cpcgroup2 = P.cpcgroup2
  AND S.AppYear   = P.AppYear - 2;
GO

IF OBJECT_ID(N'dbo.PatentRefFieldPairsStatsL2', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsStatsL2;
GO
CREATE VIEW dbo.PatentRefFieldPairsStatsL2 AS
SELECT DISTINCT
    patent_id,
    AppYear,
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYearL2)
        OVER (PARTITION BY patent_id) AS MedianRefPairFreqL2,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY FreqPerYearL2)
        OVER (PARTITION BY patent_id) AS Top10PRefPairFreqL2,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYearL2)
        OVER (PARTITION BY patent_id) AS Top5PRefPairFreqL2,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY FreqPerYearL2)
        OVER (PARTITION BY patent_id) AS Top90PRefPairFreqL2,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYearL2)
        OVER (PARTITION BY patent_id) AS Top95PRefPairFreqL2,

    SUM(CASE WHEN FreqPerYearL2 = 1 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelCohortPairsCountL2,

    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreqL2)
        OVER (PARTITION BY patent_id) AS MedianRefPairCumFreqL2,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY LaggedCumFreqL2)
        OVER (PARTITION BY patent_id) AS Top10PRefPairCumFreqL2,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreqL2)
        OVER (PARTITION BY patent_id) AS Top5PRefPairCumFreqL2,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY LaggedCumFreqL2)
        OVER (PARTITION BY patent_id) AS Top90PRefPairCumFreqL2,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreqL2)
        OVER (PARTITION BY patent_id) AS Top95PRefPairCumFreqL2,

    SUM(CASE WHEN LaggedCumFreqL2 = 0 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelHistPairsCountL2
FROM dbo.PatentRefFieldPairsFreqsL2;
GO

-- L3
IF OBJECT_ID(N'dbo.PatentRefFieldPairsFreqsL3', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsFreqsL3;
GO
CREATE VIEW dbo.PatentRefFieldPairsFreqsL3 AS
SELECT
    P.patent_id, P.AppYear, P.cpcgroup1, P.cpcgroup2,
    COALESCE(S.FreqPerYear, 0)                 AS FreqPerYearL3,
    COALESCE(S.CumFreq - S.FreqPerYear, 0)     AS LaggedCumFreqL3
FROM dbo.PatentRefsCPCCGroupPairs      AS P
LEFT JOIN dbo.PatentRefFieldPairsCumFreq AS S
  ON  S.cpcgroup1 = P.cpcgroup1
  AND S.cpcgroup2 = P.cpcgroup2
  AND S.AppYear   = P.AppYear - 3;
GO

IF OBJECT_ID(N'dbo.PatentRefFieldPairsStatsL3', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsStatsL3;
GO
CREATE VIEW dbo.PatentRefFieldPairsStatsL3 AS
SELECT DISTINCT
    patent_id,
    AppYear,
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYearL3)
        OVER (PARTITION BY patent_id) AS MedianRefPairFreqL3,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY FreqPerYearL3)
        OVER (PARTITION BY patent_id) AS Top10PRefPairFreqL3,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYearL3)
        OVER (PARTITION BY patent_id) AS Top5PRefPairFreqL3,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY FreqPerYearL3)
        OVER (PARTITION BY patent_id) AS Top90PRefPairFreqL3,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYearL3)
        OVER (PARTITION BY patent_id) AS Top95PRefPairFreqL3,

    SUM(CASE WHEN FreqPerYearL3 = 1 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelCohortPairsCountL3,

    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreqL3)
        OVER (PARTITION BY patent_id) AS MedianRefPairCumFreqL3,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY LaggedCumFreqL3)
        OVER (PARTITION BY patent_id) AS Top10PRefPairCumFreqL3,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreqL3)
        OVER (PARTITION BY patent_id) AS Top5PRefPairCumFreqL3,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY LaggedCumFreqL3)
        OVER (PARTITION BY patent_id) AS Top90PRefPairCumFreqL3,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreqL3)
        OVER (PARTITION BY patent_id) AS Top95PRefPairCumFreqL3,

    SUM(CASE WHEN LaggedCumFreqL3 = 0 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelHistPairsCountL3
FROM dbo.PatentRefFieldPairsFreqsL3;
GO

-- L4
IF OBJECT_ID(N'dbo.PatentRefFieldPairsFreqsL4', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsFreqsL4;
GO
CREATE VIEW dbo.PatentRefFieldPairsFreqsL4 AS
SELECT
    P.patent_id, P.AppYear, P.cpcgroup1, P.cpcgroup2,
    COALESCE(S.FreqPerYear, 0)                 AS FreqPerYearL4,
    COALESCE(S.CumFreq - S.FreqPerYear, 0)     AS LaggedCumFreqL4
FROM dbo.PatentRefsCPCCGroupPairs      AS P
LEFT JOIN dbo.PatentRefFieldPairsCumFreq AS S
  ON  S.cpcgroup1 = P.cpcgroup1
  AND S.cpcgroup2 = P.cpcgroup2
  AND S.AppYear   = P.AppYear - 4;
GO

IF OBJECT_ID(N'dbo.PatentRefFieldPairsStatsL4', N'V') IS NOT NULL DROP VIEW dbo.PatentRefFieldPairsStatsL4;
GO
CREATE VIEW dbo.PatentRefFieldPairsStatsL4 AS
SELECT DISTINCT
    patent_id,
    AppYear,
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYearL4)
        OVER (PARTITION BY patent_id) AS MedianRefPairFreqL4,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY FreqPerYearL4)
        OVER (PARTITION BY patent_id) AS Top10PRefPairFreqL4,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYearL4)
        OVER (PARTITION BY patent_id) AS Top5PRefPairFreqL4,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY FreqPerYearL4)
        OVER (PARTITION BY patent_id) AS Top90PRefPairFreqL4,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYearL4)
        OVER (PARTITION BY patent_id) AS Top95PRefPairFreqL4,

    SUM(CASE WHEN FreqPerYearL4 = 1 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelCohortPairsCountL4,

    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreqL4)
        OVER (PARTITION BY patent_id) AS MedianRefPairCumFreqL4,
    PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY LaggedCumFreqL4)
        OVER (PARTITION BY patent_id) AS Top10PRefPairCumFreqL4,
    PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreqL4)
        OVER (PARTITION BY patent_id) AS Top5PRefPairCumFreqL4,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY LaggedCumFreqL4)
        OVER (PARTITION BY patent_id) AS Top90PRefPairCumFreqL4,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreqL4)
        OVER (PARTITION BY patent_id) AS Top95PRefPairCumFreqL4,

    SUM(CASE WHEN LaggedCumFreqL4 = 0 THEN 1 ELSE 0 END)
        OVER (PARTITION BY patent_id) AS NovelHistPairsCountL4
FROM dbo.PatentRefFieldPairsFreqsL4;
GO


/* =========================================================
   5) Final merged view and export
   ========================================================= */

IF OBJECT_ID(N'dbo.PatentsDataMergedView', N'V') IS NOT NULL
    DROP VIEW dbo.PatentsDataMergedView;
GO
CREATE VIEW dbo.PatentsDataMergedView AS
SELECT
    PD.*,

    -- forward-citation counts & top-5% flags (NULL -> 0)
    COALESCE(FC.FCites_3year ,0) AS FCites_3year,
    COALESCE(FC.FCites_5year ,0) AS FCites_5year,
    COALESCE(FC.FCites_10year,0) AS FCites_10year,
    COALESCE(FC.Top5pct_3yr  ,0) AS Top5pct_3yr,
    COALESCE(FC.Top5pct_5yr  ,0) AS Top5pct_5yr,
    COALESCE(FC.Top5pct_10yr ,0) AS Top5pct_10yr,

    -- L1
    S1.MedianRefPairFreqL1,
    S1.Top10PRefPairFreqL1,
    S1.Top5PRefPairFreqL1,
    S1.Top90PRefPairFreqL1,
    S1.Top95PRefPairFreqL1,
    S1.MedianRefPairCumFreqL1,
    S1.Top10PRefPairCumFreqL1,
    S1.Top5PRefPairCumFreqL1,
    S1.Top90PRefPairCumFreqL1,
    S1.Top95PRefPairCumFreqL1,

    -- L2
    S2.MedianRefPairFreqL2,
    S2.Top10PRefPairFreqL2,
    S2.Top5PRefPairFreqL2,
    S2.Top90PRefPairFreqL2,
    S2.Top95PRefPairFreqL2,
    S2.MedianRefPairCumFreqL2,
    S2.Top10PRefPairCumFreqL2,
    S2.Top5PRefPairCumFreqL2,
    S2.Top90PRefPairCumFreqL2,
    S2.Top95PRefPairCumFreqL2,

    -- L3
    S3.MedianRefPairFreqL3,
    S3.Top10PRefPairFreqL3,
    S3.Top5PRefPairFreqL3,
    S3.Top90PRefPairFreqL3,
    S3.Top95PRefPairFreqL3,
    S3.MedianRefPairCumFreqL3,
    S3.Top10PRefPairCumFreqL3,
    S3.Top5PRefPairCumFreqL3,
    S3.Top90PRefPairCumFreqL3,
    S3.Top95PRefPairCumFreqL3,

    -- L4
    S4.MedianRefPairFreqL4,
    S4.Top10PRefPairFreqL4,
    S4.Top5PRefPairFreqL4,
    S4.Top90PRefPairFreqL4,
    S4.Top95PRefPairFreqL4,
    S4.MedianRefPairCumFreqL4,
    S4.Top10PRefPairCumFreqL4,
    S4.Top5PRefPairCumFreqL4,
    S4.Top90PRefPairCumFreqL4,
    S4.Top95PRefPairCumFreqL4
FROM dbo.PatentsData                      AS PD
LEFT JOIN dbo.PatentFCitesFlagged         AS FC ON PD.patent_id = FC.patent_id
LEFT JOIN dbo.PatentRefFieldPairsStatsL1  AS S1 ON PD.patent_id = S1.patent_id
LEFT JOIN dbo.PatentRefFieldPairsStatsL2  AS S2 ON PD.patent_id = S2.patent_id
LEFT JOIN dbo.PatentRefFieldPairsStatsL3  AS S3 ON PD.patent_id = S3.patent_id
LEFT JOIN dbo.PatentRefFieldPairsStatsL4  AS S4 ON PD.patent_id = S4.patent_id;
GO


/* =========================================================
   6) Materialize & export to Parquet
   ========================================================= */

IF OBJECT_ID(N'dbo.PatentsDataMerged_int', N'U') IS NOT NULL
    DROP TABLE dbo.PatentsDataMerged_int;
GO
CREATE TABLE dbo.PatentsDataMerged_int
WITH (
    DISTRIBUTION = HASH(patent_id),
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT *
FROM dbo.PatentsDataMergedView
OPTION (LABEL = 'Build_PatentsDataMerged_int');
GO

-- External CTAS (adjust LOCATION as you like)
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'PatentsDataMerged' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.PatentsDataMerged;');
GO
CREATE EXTERNAL TABLE dbo.PatentsDataMerged
WITH (
    LOCATION    = '/Output/patents/run1/PatentsDataMerged/',
    DATA_SOURCE = MyBlob,
    FILE_FORMAT = ParquetFmt
)
AS
SELECT *
FROM dbo.PatentsDataMerged_int
OPTION (LABEL = 'Export_PatentsDataMerged');
GO
